Setup

Reading in files

# Get the list of CSV files in the 'csv_cache' directory.
# `pattern` is a regular expression (not a shell glob), so anchor on the
# literal ".csv" extension.
csv_files <- list.files(path = "csv_cache", pattern = "\\.csv$", full.names = TRUE)

# Distance-class cut points in km: every 0.25 km from 0 to 4. Generating them
# with seq() guarantees that no step is skipped, so each class label used by
# the later distance filters (e.g. "(2.5,2.75]" and "(2.75,3]") exists.
distance_breaks_km <- seq(0, 4, by = 0.25)

# Add the distance in km and its distance class to one location's data.
#
# location_data: data frame with a `distance_to_train_station` column, which
#   is divided by 1000 to give the "(km)" column.
# breaks: cut points (km) passed to cut(); intervals are open on the left and
#   closed on the right, so a value outside (min(breaks), max(breaks)] gets NA.
# Returns `location_data` with the columns "distance_to_train_station(km)"
# and `distance_class` added.
add_distance_class <- function(location_data, breaks = distance_breaks_km) {
  distance_km <- location_data[["distance_to_train_station"]] / 1000
  location_data[["distance_to_train_station(km)"]] <- distance_km
  location_data$distance_class <- cut(distance_km, breaks = breaks)
  location_data
}

# Read and process every file, then bind the results once at the end instead
# of growing the data frame with rbind() on each iteration.
location_frames <- lapply(csv_files, function(file) {
  add_distance_class(read.csv(file))
})

# With no CSV files, fall back to an empty data frame.
combined_df <- if (length(location_frames) > 0) {
  do.call(rbind, location_frames)
} else {
  data.frame()
}

# Preview the first six rows of the combined data frame
print(head(combined_df, n = 6L))
##   House_ID                                address bedroom bathroom carspace
## 1        1  10/92 Buckland Street Alexandria 2015       3        2        2
## 2        2   6/92 Buckland Street Alexandria 2015       3        2        2
## 3        3 14/18-20 Newton Street Alexandria 2015       2        1        1
## 4        4   PG09/11 Power Avenue Alexandria 2015       2        2        1
## 5        5      28A Gerard Street Alexandria 2015       3        3       NA
## 6        6   1/92 Buckland Street Alexandria 2015       3        2        2
##   soldprice   yearsold  latitude longitude distance_to_train_station
## 1   2350000 2023-03-15 -33.90043  151.1947                  967.6896
## 2   2350000 2023-01-30 -33.90043  151.1947                  967.6896
## 3   1140000 2022-07-30 -33.89918  151.1909                 1325.9023
## 4   1550000 2022-07-06 -33.90129  151.1983                  669.3040
## 5   1230000 2022-06-18 -33.89771  151.1969                 1077.5788
## 6   2425000 2022-05-06 -33.90043  151.1947                  967.6896
##   distance_to_train_station(km) distance_class
## 1                     0.9676896       (0.75,1]
## 2                     0.9676896       (0.75,1]
## 3                     1.3259023     (1.25,1.5]
## 4                     0.6693040     (0.5,0.75]
## 5                     1.0775788       (1,1.25]
## 6                     0.9676896       (0.75,1]
# Preview the last six rows of the combined data frame
print(tail(combined_df, n = 6L))
##       House_ID                                   address bedroom bathroom
## 29684      810     7/24 Methven Street Mount Druitt 2770       3        1
## 29685      811     9/41 Methven Street Mount Druitt 2770       3        1
## 29686      812       1/21 Hythe Street Mount Druitt 2770       3        1
## 29687      813     1/14 Meacher Street Mount Druitt 2770       3        1
## 29688      814      4/34 Durham Street Mount Druitt 2770       3        1
## 29689      815 49/334 Woodstock Avenue Mount Druitt 2770       3       NA
##       carspace soldprice   yearsold  latitude longitude
## 29684        1    154000 2001-09-01 -33.76247  150.8216
## 29685        1    156000 2001-09-01 -33.76186  150.8249
## 29686        1    400000 2001-08-01 -33.76308  150.8211
## 29687        1    189000 2001-08-01 -33.76042  150.8203
## 29688        1    178000 2001-07-01 -33.77166  150.8122
## 29689       NA    124000 2001-06-01 -33.75742  150.8206
##       distance_to_train_station distance_to_train_station(km) distance_class
## 29684                  802.9798                     0.8029798       (0.75,1]
## 29685                  967.3295                     0.9673295       (0.75,1]
## 29686                  729.2326                     0.7292326     (0.5,0.75]
## 29687                 1019.4336                     1.0194336       (1,1.25]
## 29688                  768.9916                     0.7689916       (0.75,1]
## 29689                 1354.9054                     1.3549054     (1.25,1.5]

Filtering Data

# List the columns available in combined_df before subsetting it
cat("Column names in combined_df:\n")
## Column names in combined_df:
column_names <- colnames(combined_df)
print(column_names)
##  [1] "House_ID"                      "address"                      
##  [3] "bedroom"                       "bathroom"                     
##  [5] "carspace"                      "soldprice"                    
##  [7] "yearsold"                      "latitude"                     
##  [9] "longitude"                     "distance_to_train_station"    
## [11] "distance_to_train_station(km)" "distance_class"
# One subset of combined_df per bedroom count, 1 through 5
combined_df_1bed <- filter(combined_df, bedroom == 1)
combined_df_2bed <- filter(combined_df, bedroom == 2)
combined_df_3bed <- filter(combined_df, bedroom == 3)
combined_df_4bed <- filter(combined_df, bedroom == 4)
combined_df_5bed <- filter(combined_df, bedroom == 5)
# Box plots of sold price against distance class, per bedroom count.
#
# For each bedroom count this draws two plots (all sales pooled, then split by
# number of carspaces) and prints one summary of the sold prices.
#
# The previous par(mfrow = c(1, 2)) call was dropped: par() only configures
# base graphics, and ggplot2 draws through grid, so it never affected these
# plots.

# Build one sold-price box plot.
#
# data: data frame with `distance_class`, `soldprice` and `carspace` columns.
# title: plot title.
# fill_by_carspace: when TRUE, boxes are split and filled by factor(carspace).
# Returns the ggplot object (not printed).
plot_price_by_distance <- function(data, title, fill_by_carspace = FALSE) {
  box_mapping <- if (fill_by_carspace) aes(fill = factor(carspace)) else NULL
  ggplot(data, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(mapping = box_mapping, outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = title,
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)",
      fill = "Number of Carspaces"
    ) +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title = element_text(hjust = 0.25)
    )
}

# The list names double as the bedroom wording used in the plot titles.
bedroom_subsets <- list(
  "1 Bedroom" = combined_df_1bed,
  "2 Bedrooms" = combined_df_2bed,
  "3 Bedrooms" = combined_df_3bed,
  "4 Bedrooms" = combined_df_4bed,
  "5 Bedrooms" = combined_df_5bed
)

for (bedroom_label in names(bedroom_subsets)) {
  bedroom_sales <- bedroom_subsets[[bedroom_label]]
  plot_title <- paste("Sold Price vs Distance from Train Station for", bedroom_label)

  # Inside a loop nothing is auto-printed, so print explicitly.
  print(plot_price_by_distance(bedroom_sales, plot_title))
  print(plot_price_by_distance(bedroom_sales, plot_title, fill_by_carspace = TRUE))
  print(summary(bedroom_sales$soldprice))
}

# summary(soldprice) values from the previous knit (1 Bedroom was not
# summarised there; re-knit to obtain it):
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## 2 Bedrooms  5.000e+04 3.500e+05 4.680e+05 7.895e+05 6.100e+05 2.147e+09
## 3 Bedrooms        575    425000    570000    641588    745000  22867454
## 4 Bedrooms       1650    534050    675000    762421    865000  15000000
## 5 Bedrooms     200000    659990    783000    864065    930000   3080000
# NOTE(review): the 2.147e+09 maximum looks like a 32-bit integer ceiling used
# as a placeholder rather than a real price, and the 575 / 1650 minimums look
# implausibly low — verify against the source data.

Filtering Data by Carspaces and Bedrooms

# Subsets of combined_df for each bedroom / carspace combination plotted below

# 1 bedroom
combined_df_1bed_1car <- filter(combined_df, bedroom == 1, carspace == 1)

# 2 bedrooms
combined_df_2bed_1car <- filter(combined_df, bedroom == 2, carspace == 1)
combined_df_2bed_2car <- filter(combined_df, bedroom == 2, carspace == 2)

# 3 bedrooms
combined_df_3bed_1car <- filter(combined_df, bedroom == 3, carspace == 1)
combined_df_3bed_2car <- filter(combined_df, bedroom == 3, carspace == 2)
combined_df_3bed_3car <- filter(combined_df, bedroom == 3, carspace == 3)
combined_df_3bed_4car <- filter(combined_df, bedroom == 3, carspace == 4)

# 4 bedrooms
combined_df_4bed_1car <- filter(combined_df, bedroom == 4, carspace == 1)
combined_df_4bed_2car <- filter(combined_df, bedroom == 4, carspace == 2)
combined_df_4bed_3car <- filter(combined_df, bedroom == 4, carspace == 3)
combined_df_4bed_4car <- filter(combined_df, bedroom == 4, carspace == 4)

# 5 bedrooms
combined_df_5bed_1car <- filter(combined_df, bedroom == 5, carspace == 1)
combined_df_5bed_2car <- filter(combined_df, bedroom == 5, carspace == 2)
combined_df_5bed_3car <- filter(combined_df, bedroom == 5, carspace == 3)

1 bedroom

# Sold price by distance class for 1-bedroom, 1-carspace sales
one_bed_one_car_plot <- ggplot(
  combined_df_1bed_1car,
  aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  labs(
    title = "Sold Price vs Distance from Train Station for 1 Bedroom and 1 Carspace",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(one_bed_one_car_plot)

# Five-number summary (plus mean) of the same sold prices
summary(combined_df_1bed_1car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   61325  253000  375000  402900  487225 1330000

2 bedrooms

# Sold price by distance class for 2-bedroom sales: one box plot followed by
# one price summary per carspace count. The list names supply the carspace
# wording used in each title.
two_bed_subsets <- list(
  "1 Carspace" = combined_df_2bed_1car,
  "2 Carspaces" = combined_df_2bed_2car
)

for (carspace_label in names(two_bed_subsets)) {
  sales <- two_bed_subsets[[carspace_label]]
  price_plot <- ggplot(sales, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste("Sold Price vs Distance from Train Station for 2 Bedrooms and", carspace_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Explicit print() because nothing is auto-printed inside a loop.
  print(price_plot)
  print(summary(sales$soldprice))
}

# Rendered summary(soldprice) for 2 bedrooms, 1 carspace:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 5.700e+04 3.400e+05 4.600e+05 8.587e+05 5.990e+05 2.147e+09
# Rendered summary(soldprice) for 2 bedrooms, 2 carspaces:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   66000  432500  547500  588914  700000 1581000

3 bedrooms

# Sold price by distance class for 3-bedroom sales: one box plot followed by
# one price summary per carspace count.
#
# Each summary is taken from the same subset that was just plotted. The
# previous version summarised the 2-carspace subset a second time after the
# 3-carspace plot, so the 3-carspace prices were never reported.
# The list names supply the carspace wording used in each title.
three_bed_subsets <- list(
  "1 Carspace" = combined_df_3bed_1car,
  "2 Carspaces" = combined_df_3bed_2car,
  "3 Carspaces" = combined_df_3bed_3car,
  "4 Carspaces" = combined_df_3bed_4car
)

for (carspace_label in names(three_bed_subsets)) {
  sales <- three_bed_subsets[[carspace_label]]
  price_plot <- ggplot(sales, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste("Sold Price vs Distance from Train Station for 3 Bedrooms and", carspace_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Explicit print() because nothing is auto-printed inside a loop.
  print(price_plot)
  print(summary(sales$soldprice))
}

# Rendered summary(soldprice) for 3 bedrooms, 1 carspace:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50000  360000  500000  549157  653575 5850000
# Rendered summary(soldprice) for 3 bedrooms, 2 carspaces:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      575   499000   635000   722361   836625 22867454
# 3 bedrooms, 3 carspaces: no valid rendered output yet (the previous knit
# repeated the 2-carspace figures here); re-knit to obtain it.
# Rendered summary(soldprice) for 3 bedrooms, 4 carspaces:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  305000  477750  590000  669315  813500 1600000

4 bedrooms

# Sold price by distance class for 4-bedroom sales: one box plot followed by
# one price summary per carspace count. The list names supply the carspace
# wording used in each title.
four_bed_subsets <- list(
  "1 Carspace" = combined_df_4bed_1car,
  "2 Carspaces" = combined_df_4bed_2car,
  "3 Carspaces" = combined_df_4bed_3car,
  "4 Carspaces" = combined_df_4bed_4car
)

for (carspace_label in names(four_bed_subsets)) {
  sales <- four_bed_subsets[[carspace_label]]
  price_plot <- ggplot(sales, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste("Sold Price vs Distance from Train Station for 4 Bedrooms and", carspace_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Explicit print() because nothing is auto-printed inside a loop.
  print(price_plot)
  print(summary(sales$soldprice))
}

# Rendered summary(soldprice) for 4 bedrooms, 1 carspace:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  175000  448500  620000  659640  767500 2750000
# Rendered summary(soldprice) for 4 bedrooms, 2 carspaces:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1650   555000   690000   790872   890000 15000000
# Rendered summary(soldprice) for 4 bedrooms, 3 carspaces:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  100000  611750  769000  921054 1001000 3000000
# Rendered summary(soldprice) for 4 bedrooms, 4 carspaces:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  330000  535250  637500  684617  807500 1430000

5 bedrooms

# Sold price by distance class for 5-bedroom sales: one box plot followed by
# one price summary per carspace count. The list names supply the carspace
# wording used in each title.
five_bed_subsets <- list(
  "1 Carspace" = combined_df_5bed_1car,
  "2 Carspaces" = combined_df_5bed_2car,
  "3 Carspaces" = combined_df_5bed_3car
)

for (carspace_label in names(five_bed_subsets)) {
  sales <- five_bed_subsets[[carspace_label]]
  price_plot <- ggplot(sales, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste("Sold Price vs Distance from Train Station for 5 Bedrooms and", carspace_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Explicit print() because nothing is auto-printed inside a loop.
  print(price_plot)
  print(summary(sales$soldprice))
}

# Rendered summary(soldprice) for 5 bedrooms, 1 carspace:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  335000  602500  722500  743607  870000 1850000
# Rendered summary(soldprice) for 5 bedrooms, 2 carspaces:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  200000  663748  820000  900051  960416 3080000
# Rendered summary(soldprice) for 5 bedrooms, 3 carspaces:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  700000  738000  790000  787833  838250  910000

Creating a column for Year

# Derive the sale year (as a factor) from the `yearsold` date column
sale_dates <- as.Date(combined_df$yearsold)
combined_df$Year <- as.factor(format(sale_dates, "%Y"))

# Split by distance class: one subset per 0.25 km band, each named after the
# band's lower bound in km
combined_df_0.00 <- filter(combined_df, distance_class == "(0,0.25]")
combined_df_0.25 <- filter(combined_df, distance_class == "(0.25,0.5]")
combined_df_0.50 <- filter(combined_df, distance_class == "(0.5,0.75]")
combined_df_0.75 <- filter(combined_df, distance_class == "(0.75,1]")
combined_df_1.00 <- filter(combined_df, distance_class == "(1,1.25]")
combined_df_1.25 <- filter(combined_df, distance_class == "(1.25,1.5]")
combined_df_1.50 <- filter(combined_df, distance_class == "(1.5,1.75]")
combined_df_1.75 <- filter(combined_df, distance_class == "(1.75,2]")
combined_df_2.00 <- filter(combined_df, distance_class == "(2,2.25]")
combined_df_2.25 <- filter(combined_df, distance_class == "(2.25,2.5]")
combined_df_2.50 <- filter(combined_df, distance_class == "(2.5,2.75]")
combined_df_2.75 <- filter(combined_df, distance_class == "(2.75,3]")
combined_df_3.00 <- filter(combined_df, distance_class == "(3,3.25]")
combined_df_3.25 <- filter(combined_df, distance_class == "(3.25,3.5]")
combined_df_3.50 <- filter(combined_df, distance_class == "(3.5,3.75]")
combined_df_3.75 <- filter(combined_df, distance_class == "(3.75,4]")
# Sold price by sale year, one box plot and one price summary per distance
# band.
#
# The list names are the band wording used in each plot title. The
# (3.25,3.5] band was previously titled "3.25 to 3.75km"; it now reads
# "3.25 to 3.50km", matching the subset it plots.
distance_band_subsets <- list(
  "0 to 0.25" = combined_df_0.00,
  "0.25 to 0.50" = combined_df_0.25,
  "0.50 to 0.75" = combined_df_0.50,
  "0.75 to 1.00" = combined_df_0.75,
  "1.00 to 1.25" = combined_df_1.00,
  "1.25 to 1.50" = combined_df_1.25,
  "1.50 to 1.75" = combined_df_1.50,
  "1.75 to 2.00" = combined_df_1.75,
  "2.00 to 2.25" = combined_df_2.00,
  "2.25 to 2.50" = combined_df_2.25,
  "2.50 to 2.75" = combined_df_2.50,
  "2.75 to 3.00" = combined_df_2.75,
  "3.00 to 3.25" = combined_df_3.00,
  "3.25 to 3.50" = combined_df_3.25,
  "3.50 to 3.75" = combined_df_3.50,
  "3.75 to 4.00" = combined_df_3.75
)

for (band_label in names(distance_band_subsets)) {
  band_sales <- distance_band_subsets[[band_label]]
  yearly_plot <- ggplot(band_sales, aes(x = Year, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste0("Sold Price vs year for townhouses ", band_label, "km from train station"),
      x = "Year",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Explicit print() because nothing is auto-printed inside a loop.
  print(yearly_plot)
  print(summary(band_sales$soldprice))
}

# summary(soldprice) values from the previous knit, one row per band (km):
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## 0 to 0.25         50000    415000    565000    655697    780000   3300000
## 0.25 to 0.50  5.000e+04 4.500e+05 5.980e+05 1.106e+06 7.918e+05 2.147e+09
## 0.50 to 0.75       1650    425000    580000    650450    775000  15000000
## 0.75 to 1.00      60000    420000    547000    609743    712000   6203000
## 1.00 to 1.25        575    392250    540000    606655    719000   4400000
## 1.25 to 1.50      61325    378000    524000    572350    680000   4840000
## 1.50 to 1.75      92000    374462    522250    563676    650000   2812000
## 1.75 to 2.00     100000    359250    509500    566946    680000   3100000
## 2.00 to 2.25     132500    370000    490275    529213    620000   2430000
## 2.25 to 2.50     190000    382500    550000    592289    687000   5346000
## 2.50 to 2.75     (no values printed)
## 2.75 to 3.00     (no values printed)
## 3.00 to 3.25     180000    357500    500101    538112    600000   1777000
## 3.25 to 3.50     225000    300000    460000    446274    543000   1125000
## 3.50 to 3.75     250000    300000    355000    386685    441000    664000
## 3.75 to 4.00     (no values printed)
# Overall views of sold price across the whole data set: by sale year
# (scatter coloured by distance class, then box plot) and by number of
# bedrooms, bathrooms and carspaces.
#
# Legend labels now match the aesthetics that are actually mapped. The two
# year plots previously set a fill label ("Number of Carspaces") although
# neither maps fill, which left the scatter's colour legend titled with the
# raw column name.

# Look shared by every plot in this section.
price_theme <- theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )
price_axis_label <- "Selling Price (x$100000)"

# Every sale by year, coloured by distance class
ggplot(combined_df, aes(x = Year, y = soldprice / 100000)) +
  geom_point(aes(color = distance_class)) +
  labs(
    title = "Sold Price over Years",
    x = "Year",
    y = price_axis_label,
    color = "Distance from Train Station(km)"
  ) +
  price_theme

# Price distribution per year
ggplot(combined_df, aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price over Years", x = "Year", y = price_axis_label) +
  price_theme

# Price distribution per bedroom count
ggplot(combined_df, aes(x = factor(bedroom), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price for Different Numbers of Bedrooms",
    x = "Number of Bedrooms",
    y = price_axis_label
  ) +
  price_theme

# Price distribution per bathroom count
ggplot(combined_df, aes(x = factor(bathroom), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price for Different Numbers of Bathrooms",
    x = "Number of Bathrooms",
    y = price_axis_label
  ) +
  price_theme

# Price distribution per carspace count
ggplot(combined_df, aes(x = factor(carspace), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price for Different Numbers of Carspaces",
    x = "Number of Carspaces",
    y = price_axis_label
  ) +
  price_theme

Two added graphs from Jasmine Mon Apr 17, 2023 7 pm

# Drop outliers with the 1.5 * IQR rule, fit a simple linear model of sold
# price on distance, and plot its residuals.
#
# Both sets of fences are computed on the full combined_df. na.rm = TRUE lets
# quantile() ignore missing values instead of stopping with an error.

# Fences for sold price
q1 <- quantile(combined_df$soldprice, 0.25, na.rm = TRUE)
q3 <- quantile(combined_df$soldprice, 0.75, na.rm = TRUE)
iqr <- q3 - q1
combined <- subset(combined_df, soldprice >= q1 - 1.5 * iqr & soldprice <= q3 + 1.5 * iqr)

# Fences for distance to the train station (km)
Q1 <- quantile(combined_df$`distance_to_train_station(km)`, 0.25, na.rm = TRUE)
Q3 <- quantile(combined_df$`distance_to_train_station(km)`, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1

# Apply the distance filter to the price-filtered `combined`, not to
# combined_df. Starting again from combined_df discarded the price filter
# above, leaving price outliers in the plot and the model.
# NOTE(review): an earlier edit (17 Apr 2023) deliberately switched this call
# between `combined` and `combined_df` — confirm that filtering on both
# variables is the intended behaviour.
combined <- subset(
  combined,
  `distance_to_train_station(km)` >= Q1 - 1.5 * IQR &
    `distance_to_train_station(km)` <= Q3 + 1.5 * IQR
)

# Price spread by distance class, outliers removed
ggplot(combined, aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(plot.title = element_text(hjust = 0.25))

# Linear model of sold price on distance, with residuals plotted against the
# predictor and a zero reference line
model <- lm(soldprice ~ `distance_to_train_station(km)`, data = combined)
plot(
  combined$"distance_to_train_station(km)", resid(model),
  main = "Residual Plot",
  xlab = "Distance to train station (km)",
  ylab = "Residuals",
  cex = 0.15
)
abline(h = 0)